JASON STOPAS
MODULE 04: LAB 01

In [1]:
import gdown
import polars as pl
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import plotly.graph_objects as go
In [2]:
drive_loc = 'https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ'

download_LC = 'downloaded_file.csv'
gdown.download(drive_loc, download_LC, quiet=False)

Jobs = pl.read_csv(download_LC)
Downloading...
From (original): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ
From (redirected): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ&confirm=t&uuid=1d8c7256-c988-4711-b880-980200dc25ab
To: C:\Users\jtsto\OneDrive\Documents\03 BU SPRING 2025\AD 688\Mod04_Lab01\downloaded_file.csv
100%|██████████| 717M/717M [01:04<00:00, 11.1MB/s] 
In [3]:
Jobs.head()
Out[3]:
shape: (5, 131)
IDLAST_UPDATED_DATELAST_UPDATED_TIMESTAMPDUPLICATESPOSTEDEXPIREDDURATIONSOURCE_TYPESSOURCESURLACTIVE_URLSACTIVE_SOURCES_INFOTITLE_RAWBODYMODELED_EXPIREDMODELED_DURATIONCOMPANYCOMPANY_NAMECOMPANY_RAWCOMPANY_IS_STAFFINGEDUCATION_LEVELSEDUCATION_LEVELS_NAMEMIN_EDULEVELSMIN_EDULEVELS_NAMEMAX_EDULEVELSMAX_EDULEVELS_NAMEEMPLOYMENT_TYPEEMPLOYMENT_TYPE_NAMEMIN_YEARS_EXPERIENCEMAX_YEARS_EXPERIENCEIS_INTERNSHIPSALARYREMOTE_TYPEREMOTE_TYPE_NAMEORIGINAL_PAY_PERIODSALARY_TOSALARY_FROM…SOC_2021_5_NAMELOT_CAREER_AREALOT_CAREER_AREA_NAMELOT_OCCUPATIONLOT_OCCUPATION_NAMELOT_SPECIALIZED_OCCUPATIONLOT_SPECIALIZED_OCCUPATION_NAMELOT_OCCUPATION_GROUPLOT_OCCUPATION_GROUP_NAMELOT_V6_SPECIALIZED_OCCUPATIONLOT_V6_SPECIALIZED_OCCUPATION_NAMELOT_V6_OCCUPATIONLOT_V6_OCCUPATION_NAMELOT_V6_OCCUPATION_GROUPLOT_V6_OCCUPATION_GROUP_NAMELOT_V6_CAREER_AREALOT_V6_CAREER_AREA_NAMESOC_2SOC_2_NAMESOC_3SOC_3_NAMESOC_4SOC_4_NAMESOC_5SOC_5_NAMELIGHTCAST_SECTORSLIGHTCAST_SECTORS_NAMENAICS_2022_2NAICS_2022_2_NAMENAICS_2022_3NAICS_2022_3_NAMENAICS_2022_4NAICS_2022_4_NAMENAICS_2022_5NAICS_2022_5_NAMENAICS_2022_6NAICS_2022_6_NAME
strstrstri64strstri64strstrstrstrstrstrstrstri64i64strstrboolstrstri64stri64stri64stri64i64booli64i64strstri64i64…stri64stri64stri64stri64stri64stri64stri64stri64strstrstrstrstrstrstrstrstrstrstri64stri64stri64stri64stri64str
"1f57d95acf4dc67ed2819eb12f049f…"9/6/2024""2024-09-06 20:32:57.352 Z"0"6/2/2024""6/8/2024"6"[   "Company" ]""[   "brassring.com" ]""[   "https://sjobs.brassring.c…"[]"null"Enterprise Analyst (II-III)""31-May-2024 Enterprise Analys…"6/8/2024"6894731"Murphy USA""Murphy USA"false"[   2 ]""[   "Bachelor's degree" ]"2"Bachelor's degree"nullnull1"Full-time (> 32 hours)"22falsenull0"[None]"nullnullnull…"Data Scientists"23"Information Technology and Com…231010"Business Intelligence Analyst"23101011"General ERP Analyst / Consulta…2310"Business Intelligence"23101011"General ERP Analyst / Consulta…231010"Business Intelligence Analyst"2310"Business Intelligence"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists""[   7 ]""[   "Artificial Intelligence" …44"Retail Trade"441"Motor Vehicle and Parts Dealer…4413"Automotive Parts, Accessories,…44133"Automotive Parts and Accessori…441330"Automotive Parts and Accessori…
"0cb072af26757b6c4ea9464472a50a…"8/2/2024""2024-08-02 17:08:58.838 Z"0"6/2/2024""8/1/2024"null"[   "Job Board" ]""[   "maine.gov" ]""[   "https://joblink.maine.gov…"[]"null"Oracle Consultant - Reports (3…"Oracle Consultant - Reports (3…"8/1/2024"null133098"Smx Corporation Limited""SMX"true"[   99 ]""[   "No Education Listed" ]"99"No Education Listed"nullnull1"Full-time (> 32 hours)"33falsenull1"Remote"nullnullnull…"Data Scientists"23"Information Technology and Com…231010"Business Intelligence Analyst"23101012"Oracle Consultant / Analyst"2310"Business Intelligence"23101012"Oracle Consultant / Analyst"231010"Business Intelligence Analyst"2310"Business Intelligence"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists"nullnull56"Administrative and Support and…561"Administrative and Support Ser…5613"Employment Services"56132"Temporary Help Services"561320"Temporary Help Services"
"85318b12b3331fa490d32ad014379d…"9/6/2024""2024-09-06 20:32:57.352 Z"1"6/2/2024""7/7/2024"35"[   "Job Board" ]""[   "dejobs.org" ]""[   "https://dejobs.org/dallas…"[]"null"Data Analyst""Taking care of people is at th…"6/10/2024"839063746"Sedgwick""Sedgwick"false"[   2 ]""[   "Bachelor's degree" ]"2"Bachelor's degree"nullnull1"Full-time (> 32 hours)"5nullfalsenull0"[None]"nullnullnull…"Data Scientists"23"Information Technology and Com…231113"Data / Data Mining Analyst"23111310"Data Analyst"2311"Data Analysis and Mathematics"23111310"Data Analyst"231113"Data / Data Mining Analyst"2311"Data Analysis and Mathematics"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists"nullnull52"Finance and Insurance"524"Insurance Carriers and Related…5242"Agencies, Brokerages, and Othe…52429"Other Insurance Related Activi…524291"Claims Adjusting"
"1b5c3941e54a1889ef4f8ae55b401a…"9/6/2024""2024-09-06 20:32:57.352 Z"1"6/2/2024""7/20/2024"48"[   "Job Board" ]""[   "disabledperson.com",   "d…"[   "https://www.disabledperso…"[]"null"Sr. Lead Data Mgmt. Analyst - …"About this role: Wells Fargo …"6/12/2024"1037615159"Wells Fargo""Wells Fargo"false"[   99 ]""[   "No Education Listed" ]"99"No Education Listed"nullnull1"Full-time (> 32 hours)"3nullfalsenull0"[None]"nullnullnull…"Data Scientists"23"Information Technology and Com…231113"Data / Data Mining Analyst"23111310"Data Analyst"2311"Data Analysis and Mathematics"23111310"Data Analyst"231113"Data / Data Mining Analyst"2311"Data Analysis and Mathematics"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists""[   6 ]""[   "Data Privacy/Protection" …52"Finance and Insurance"522"Credit Intermediation and Rela…5221"Depository Credit Intermediati…52211"Commercial Banking"522110"Commercial Banking"
"cb5ca25f02bdf25c13edfede793150…"6/19/2024""2024-06-19 07:00:00.000 Z"0"6/2/2024""6/17/2024"15"[   "FreeJobBoard" ]""[   "craigslist.org" ]""[   "https://modesto.craigslis…"[]"null"Comisiones de $1000 - $3000 po…"Comisiones de $1000 - $3000 po…"6/17/2024"150"Unclassified""LH/GM"false"[   99 ]""[   "No Education Listed" ]"99"No Education Listed"nullnull3"Part-time / full-time"nullnullfalse925000"[None]""year"15000035000…"Data Scientists"23"Information Technology and Com…231010"Business Intelligence Analyst"23101012"Oracle Consultant / Analyst"2310"Business Intelligence"23101012"Oracle Consultant / Analyst"231010"Business Intelligence Analyst"2310"Business Intelligence"23"Information Technology and Com…"15-0000""Computer and Mathematical Occu…"15-2000""Mathematical Science Occupatio…"15-2050""Data Scientists""15-2051""Data Scientists"nullnull99"Unclassified Industry"999"Unclassified Industry"9999"Unclassified Industry"99999"Unclassified Industry"999999"Unclassified Industry"
In [4]:
Jobs.shape
Out[4]:
(72498, 131)
In [5]:
columns = Jobs.columns

for col in columns:
    print(col)
ID
LAST_UPDATED_DATE
LAST_UPDATED_TIMESTAMP
DUPLICATES
POSTED
EXPIRED
DURATION
SOURCE_TYPES
SOURCES
URL
ACTIVE_URLS
ACTIVE_SOURCES_INFO
TITLE_RAW
BODY
MODELED_EXPIRED
MODELED_DURATION
COMPANY
COMPANY_NAME
COMPANY_RAW
COMPANY_IS_STAFFING
EDUCATION_LEVELS
EDUCATION_LEVELS_NAME
MIN_EDULEVELS
MIN_EDULEVELS_NAME
MAX_EDULEVELS
MAX_EDULEVELS_NAME
EMPLOYMENT_TYPE
EMPLOYMENT_TYPE_NAME
MIN_YEARS_EXPERIENCE
MAX_YEARS_EXPERIENCE
IS_INTERNSHIP
SALARY
REMOTE_TYPE
REMOTE_TYPE_NAME
ORIGINAL_PAY_PERIOD
SALARY_TO
SALARY_FROM
LOCATION
CITY
CITY_NAME
COUNTY
COUNTY_NAME
MSA
MSA_NAME
STATE
STATE_NAME
COUNTY_OUTGOING
COUNTY_NAME_OUTGOING
COUNTY_INCOMING
COUNTY_NAME_INCOMING
MSA_OUTGOING
MSA_NAME_OUTGOING
MSA_INCOMING
MSA_NAME_INCOMING
NAICS2
NAICS2_NAME
NAICS3
NAICS3_NAME
NAICS4
NAICS4_NAME
NAICS5
NAICS5_NAME
NAICS6
NAICS6_NAME
TITLE
TITLE_NAME
TITLE_CLEAN
SKILLS
SKILLS_NAME
SPECIALIZED_SKILLS
SPECIALIZED_SKILLS_NAME
CERTIFICATIONS
CERTIFICATIONS_NAME
COMMON_SKILLS
COMMON_SKILLS_NAME
SOFTWARE_SKILLS
SOFTWARE_SKILLS_NAME
ONET
ONET_NAME
ONET_2019
ONET_2019_NAME
CIP6
CIP6_NAME
CIP4
CIP4_NAME
CIP2
CIP2_NAME
SOC_2021_2
SOC_2021_2_NAME
SOC_2021_3
SOC_2021_3_NAME
SOC_2021_4
SOC_2021_4_NAME
SOC_2021_5
SOC_2021_5_NAME
LOT_CAREER_AREA
LOT_CAREER_AREA_NAME
LOT_OCCUPATION
LOT_OCCUPATION_NAME
LOT_SPECIALIZED_OCCUPATION
LOT_SPECIALIZED_OCCUPATION_NAME
LOT_OCCUPATION_GROUP
LOT_OCCUPATION_GROUP_NAME
LOT_V6_SPECIALIZED_OCCUPATION
LOT_V6_SPECIALIZED_OCCUPATION_NAME
LOT_V6_OCCUPATION
LOT_V6_OCCUPATION_NAME
LOT_V6_OCCUPATION_GROUP
LOT_V6_OCCUPATION_GROUP_NAME
LOT_V6_CAREER_AREA
LOT_V6_CAREER_AREA_NAME
SOC_2
SOC_2_NAME
SOC_3
SOC_3_NAME
SOC_4
SOC_4_NAME
SOC_5
SOC_5_NAME
LIGHTCAST_SECTORS
LIGHTCAST_SECTORS_NAME
NAICS_2022_2
NAICS_2022_2_NAME
NAICS_2022_3
NAICS_2022_3_NAME
NAICS_2022_4
NAICS_2022_4_NAME
NAICS_2022_5
NAICS_2022_5_NAME
NAICS_2022_6
NAICS_2022_6_NAME

filter for those with salary info

In [6]:
Jobs_w_salary_info = Jobs.filter(
    ~(
        Jobs["SALARY"].is_null() |
        Jobs["SALARY_TO"].is_null() |
        Jobs["SALARY_FROM"].is_null()
    )
)
In [7]:
Jobs_w_salary_info.select(["SALARY_FROM", "SALARY_TO", "SALARY"]).describe()
Out[7]:
shape: (9, 4)
statisticSALARY_FROMSALARY_TOSALARY
strf64f64f64
"count"30808.030808.030808.0
"null_count"0.00.00.0
"mean"96108.560277139349.489938117953.755031
"std"38651.51593557867.60604645133.878359
"min"10230.016640.015860.0
"25%"66560.096471.084933.0
"50%"90000.0135000.0116300.0
"75%"119800.0178200.0145600.0
"max"500000.0500000.0500000.0
In [8]:
Jobs_w_salary_info.shape
Out[8]:
(30808, 131)
In [9]:
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE_NAME").value_counts()
)
Out[9]:
shape: (3, 1)
EMPLOYMENT_TYPE_NAME
struct[2]
{"Part-time (≤ 32 hours)",1038}
{"Full-time (> 32 hours)",29151}
{"Part-time / full-time",619}
In [10]:
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE").value_counts()
)
Out[10]:
shape: (3, 1)
EMPLOYMENT_TYPE
struct[2]
{2,1038}
{1,29151}
{3,619}
In [11]:
Jobs_w_salary_info.group_by("EMPLOYMENT_TYPE_NAME").agg([
    pl.len().alias("Job_Count"),
    pl.col("SALARY").mean().alias("Avg_Salary"),
    pl.col("SALARY").median().alias("Median_Salary"),
    pl.col("SALARY").min().alias("Min_Salary"),
    pl.col("SALARY").max().alias("Max_Salary"),
    pl.col("SALARY").std().alias("Salary_StdDev")
]).sort("Avg_Salary", descending=True)
Out[11]:
shape: (3, 7)
EMPLOYMENT_TYPE_NAMEJob_CountAvg_SalaryMedian_SalaryMin_SalaryMax_SalarySalary_StdDev
stru32f64f64i64i64f64
"Full-time (> 32 hours)"29151118897.558609116500.02058350000044351.533443
"Part-time / full-time"619105621.242326100000.02080045537552979.422642
"Part-time (≤ 32 hours)"103898802.50963486390.01586031005055382.720356
In [12]:
df_plot = Jobs_w_salary_info.select(["EMPLOYMENT_TYPE_NAME", "SALARY_FROM"]).to_pandas()

employment_types = df_plot["EMPLOYMENT_TYPE_NAME"].unique()

plot_01 = go.Figure()

for emp_type in employment_types:
    plot_01.add_trace(
        go.Box(
            y=df_plot[df_plot["EMPLOYMENT_TYPE_NAME"] == emp_type]["SALARY_FROM"],
            name=emp_type,
            boxpoints='outliers',
            marker_color='lightgray',
            line_color='black',
            showlegend=False
        )
    )

plot_01.add_trace(
    go.Scatter(
        x=df_plot["EMPLOYMENT_TYPE_NAME"],
        y=df_plot["SALARY_FROM"],
        mode='markers',
        marker=dict(
            color=df_plot["SALARY_FROM"],
            colorscale='Viridis',
            showscale=True,
            size=6,
            opacity=0.7,
            colorbar=dict(title="Starting Salary")
        ),
        name='Salary Points',
        hoverinfo='x+y'
    )
)

plot_01.update_layout(
    title="Starting Salary Distribution by Employment Type",
    xaxis_title="Employment Type",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(
        tick0=0,
        dtick=50000,
        gridcolor="lightgray"
    ),
    margin=dict(t=80, b=100, l=80, r=40),
    boxmode='group',
    boxgap=0.4,
    height=600,
    width=1000
)


plot_01.show()

What this image shows is that employees in full-time roles generally earn more than those with part-time jobs. This result is also intuitive and makes logical sense. It is unclear what the category of ‘Part time / full-time’ represents

In [13]:
df_plot = Jobs_w_salary_info.select(["NAICS2_NAME", "SALARY_FROM"]).to_pandas()

industries = df_plot["NAICS2_NAME"].unique()

plot_02 = go.Figure()

for industry in industries:
    plot_02.add_trace(
        go.Box(
            y=df_plot[df_plot["NAICS2_NAME"] == industry]["SALARY_FROM"],
            name=industry,
            boxpoints='outliers',
            marker_color='lightgray',
            line_color='black',
            showlegend=False
        )
    )

plot_02.add_trace(
    go.Scatter(
        x=df_plot["NAICS2_NAME"],
        y=df_plot["SALARY_FROM"],
        mode='markers',
        marker=dict(
            color=df_plot["SALARY_FROM"],
            colorscale='Turbo', 
            showscale=True,
            size=6,
            opacity=0.7,
            colorbar=dict(title="Starting Salary")
        ),
        name='Salary Points',
        hoverinfo='x+y'
    )
)

plot_02.update_layout(
    title="Starting Salary Distribution by Industry (NAICS2_NAME)",
    xaxis_title="Industry (NAICS2)",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(
        tick0=0,
        dtick=50000,       
        gridcolor="lightgray"
    ),
    margin=dict(
        t=80,  
        b=120, 
        l=80,
        r=40
    ),
    boxmode='group',
    boxgap=0.4,  
    height=700,  
    width=1200   
)

plot_02.show()

The plot shows the STARTING_FROM salary for the various industries indicated in the NAICS2_NAME column. One can see some very high starting ‘from’ salaries in the administrative and support and waste management and remediation services category, and also health care and social assistance.

In [14]:
Jobs_with_dates = Jobs.with_columns(
    pl.col("POSTED").cast(pl.Utf8).str.strptime(pl.Date, "%m/%d/%Y", strict=False).alias("POSTED_DATE")
)

daily_postings = (
    Jobs_with_dates
    .group_by("POSTED_DATE")
    .agg(pl.len().alias("Post_Count"))
    .sort("POSTED_DATE")
)

df_line = daily_postings.to_pandas()

df_line["Smoothed"] = df_line["Post_Count"].rolling(window=7, center=True).mean()
In [15]:
plot_03 = go.Figure()

# Raw daily line (gray)
plot_03.add_trace(go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Post_Count"],
    mode="lines",
    name="Raw Daily Count",
    line=dict(color="lightgray", width=2)
))

# Smoothed line (red)
plot_03.add_trace(go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Smoothed"],
    mode="lines",
    name="7-Day Smoothed",
    line=dict(color="red", width=3)
))

# Layout and formatting
plot_03.update_layout(
    title="Job Postings Over Time (Raw + 7-Day Smoothed)",
    xaxis_title="Date",
    yaxis_title="Number of Postings",
    template="plotly_white",
    height=500,
    width=1000,
    xaxis_tickangle=-45,
    margin=dict(t=60, b=100, l=60, r=40),
    legend=dict(x=0.01, y=0.99, bordercolor="gray", borderwidth=1)
)

plot_03.show()
In [16]:
Jobs_with_dates.select([
    pl.col("POSTED_DATE").min().alias("Earliest"),
    pl.col("POSTED_DATE").max().alias("Latest")
])
Out[16]:
shape: (1, 2)
EarliestLatest
datedate
2024-05-012024-09-30

This shows the jobs posted per day in the dataset, indicating a general upward trend. While the top day of the week varies slightly from day to day the lowest days (amount posted) are, in general, always on Sundays.

In [17]:
top_titles = (
    Jobs.group_by("TITLE_NAME")
        .agg(pl.len().alias("Count"))
        .sort("Count", descending=True)
        .limit(10)
)
In [18]:
df_top_titles = top_titles.to_pandas()
In [19]:
plot_04 = px.bar(
    df_top_titles,
    x="TITLE_NAME",
    y="Count",
    color="TITLE_NAME",  # Different color per title
    title="Top 10 Job Titles by Number of Postings",
    labels={"TITLE_NAME": "Job Title", "Count": "Number of Postings"},
    template="plotly_white"
)

plot_04.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000,
    showlegend=False,  # Optional: hide legend since x-axis already shows titles
    margin=dict(t=60, b=100, l=60, r=40)
)

plot_04.show()

This bar chart is generally easy to read and shows the viewer that ‘Data Analysts’ is the top job title in the dataset, by a lot...

In [20]:
Jobs.select("REMOTE_TYPE_NAME").unique()
Out[20]:
shape: (5, 1)
REMOTE_TYPE_NAME
str
"Hybrid Remote"
"Not Remote"
"Remote"
null
"[None]"
In [21]:
Jobs_cleaned_remote = Jobs.with_columns(
    pl.when(
        pl.col("REMOTE_TYPE_NAME").is_null() | 
        (pl.col("REMOTE_TYPE_NAME").cast(pl.Utf8).str.strip_chars("[]") == "None")
    )
    .then(pl.lit("Unknown"))
    .otherwise(pl.col("REMOTE_TYPE_NAME"))
    .alias("REMOTE_TYPE_NAME")
)
In [22]:
Jobs_cleaned_remote.select("REMOTE_TYPE_NAME").unique()
Out[22]:
shape: (4, 1)
REMOTE_TYPE_NAME
str
"Hybrid Remote"
"Remote"
"Not Remote"
"Unknown"
In [23]:
remote_counts = (
    Jobs_cleaned_remote
    .group_by("REMOTE_TYPE_NAME")
    .agg(pl.len().alias("Count"))
    .sort("Count", descending=True)
)
In [24]:
df_remote = remote_counts.to_pandas()

plot_05 = px.pie(
    df_remote,
    names="REMOTE_TYPE_NAME",
    values="Count",
    title="Distribution of Remote Work Types",
    color_discrete_sequence=px.colors.qualitative.Set3
)

plot_05.update_traces(textposition='inside', textinfo='percent+label')

plot_05.update_layout(
    template="plotly_white",
    margin=dict(t=60, b=60, l=60, r=60)
)

plot_05.show()

It appears that in this case a lot of the information is missing concerning whether a job posting has indicated if it is remote or not (with 'None' being the major category). For the purposes of the pie chart, I’m showing the main category of 'None' (along with the few NaN values in there) as Unknown.

In [25]:
Jobs.select("SKILLS_NAME").unique().shape
Out[25]:
(44173, 1)
In [26]:
Jobs.select("SKILLS_NAME").unique().head(20)
Out[26]:
shape: (20, 1)
SKILLS_NAME
str
"[   "Management",   "Coaching"…
"[   "Systems Development Life …
"[   "Business Objectives",   "…
"[   "Presentations",   "Office…
"[   "Ansys Simulation Software…
…
"[   "Management",   "Microsoft…
"[   "Enterprise Resource Plann…
"[   "Workflow Management",   "…
"[   "Bed Management",   "Healt…
"[   "Mathematics",   "Collecti…
In [27]:
exploded = Jobs.with_columns(
    pl.col("SKILLS_NAME").cast(pl.List(pl.Utf8))
).explode("SKILLS_NAME")
In [28]:
skills_per_industry = (
    exploded
    .group_by("NAICS_2022_6_NAME")
    .agg(pl.len().alias("Skill_Count"))
    .sort("Skill_Count", descending=True)
    .limit(15)
)
In [29]:
df_industry_skills = skills_per_industry.to_pandas()
In [30]:
plot_06 = px.bar(
    df_industry_skills,
    x="NAICS_2022_6_NAME",
    y="Skill_Count",
    title="Top 15 Industries by Skill Mentions",
    labels={
        "NAICS_2022_6_NAME": "Industry",
        "Skill_Count": "Total Skill Mentions"
    },
    template="plotly_white",
    color="NAICS_2022_6_NAME", 
    color_discrete_sequence=px.colors.qualitative.Pastel 
)

plot_06.update_layout(
    xaxis_tickangle=-45,
    height=700,
    width=1200,
    margin=dict(t=60, b=140, l=60, r=40),
    showlegend=False
)

plot_06.show()

There are 44173 skills in the SKILLS_NAME column. To make this plot readable and help to understand the question I’m showing the top 15 industries by count of skill mentions

In [31]:
Jobs.select("ONET_NAME").unique()
Out[31]:
shape: (2, 1)
ONET_NAME
str
null
"Business Intelligence Analysts"
In [32]:
Jobs_w_salary_info.select("ONET_NAME").unique()
Out[32]:
shape: (1, 1)
ONET_NAME
str
"Business Intelligence Analysts"
In [33]:
Jobs_w_salary_info.filter(
    pl.col("ONET_NAME") == "Business Intelligence Analysts"
).select(
    pl.median("SALARY").alias("Median_Salary")
)
Out[33]:
shape: (1, 1)
Median_Salary
f64
116300.0
In [34]:
plot_07 = px.bar(
    x=["Business Intelligence Analysts"],
    y=[Jobs_w_salary_info.filter(pl.col("ONET_NAME") == "Business Intelligence Analysts")
       .select(pl.median("SALARY"))[0, 0]],
    labels={"x": "[ONET] Occupation", "y": "Median Salary"},
    title="Median Salary: Business Intelligence Analysts",
    template="plotly_white"
)

plot_07.update_traces(marker_color='teal')

plot_07.show()

The only ONET_NAME category, when filtered for salary information, is Business Intelligence Analysists. When not filtered for salary information it’s Business Intelligence Analysists and null.

In [35]:
Jobs.select("SOC_2021_2_NAME").unique()
Out[35]:
shape: (2, 1)
SOC_2021_2_NAME
str
null
"Computer and Mathematical Occu…
In [36]:
Jobs.group_by("SOC_2021_2_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
Out[36]:
shape: (2, 2)
SOC_2021_2_NAMEPost_Count
stru32
"Computer and Mathematical Occu…72454
null44
In [37]:
Jobs.select("SOC_2021_3_NAME").unique()
Out[37]:
shape: (2, 1)
SOC_2021_3_NAME
str
"Mathematical Science Occupatio…
null
In [38]:
Jobs.group_by("SOC_2021_3_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
Out[38]:
shape: (2, 2)
SOC_2021_3_NAMEPost_Count
stru32
"Mathematical Science Occupatio…72454
null44
In [39]:
transitions = (
    Jobs.group_by(["SOC_2021_2_NAME", "SOC_2021_3_NAME"])
    .agg(pl.len().alias("Count"))
    .filter(pl.col("SOC_2021_2_NAME").is_not_null() & pl.col("SOC_2021_3_NAME").is_not_null())
)
In [40]:
df_transitions = transitions.to_pandas()
In [41]:
all_labels = pd.unique(
    np.concatenate([
        df_transitions["SOC_2021_2_NAME"].values,
        df_transitions["SOC_2021_3_NAME"].values
    ])
)

label_to_index = {label: idx for idx, label in enumerate(all_labels)}

df_transitions["source"] = df_transitions["SOC_2021_2_NAME"].map(label_to_index)
df_transitions["target"] = df_transitions["SOC_2021_3_NAME"].map(label_to_index)
In [42]:
plot_08 = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=list(label_to_index.keys()),
        color="lightblue"
    ),
    link=dict(
        source=df_transitions["source"],
        target=df_transitions["target"],
        value=df_transitions["Count"]
    )
)])

plot_08.update_layout(
    title_text="SOC Transitions: 2-Digit → 3-Digit Level",
    font_size=12,
    height=700,
    width=1000
)

plot_08.show()

Same thing here it appears that both SOC_2021_2_NAME and SOC_2021_3_NAME contain the entire dataset, making a plot of this kind generally uninformative.